*************************************************
*************************************************
*** 01a_get_cen_acs_viet                      ***
*************************************************
*************************************************

*** FILE: Pulls all source data for the AHA immigrants at relevant ages and arrival cohorts;
*** DECENNIAL & ACS Vietnamese;

* Individuals from cleaned ACS & Decennial household data
* Men
* Builds the men's person-level extract and the matching list of unique PIKs.
cd /.../destcenacs/2021/
use cenacs_men_pik.dta
* Keep records whose qpobst code is "247" (presumably Vietnam, per the file header - confirm)
keep if qpobst=="247"
destring age qyr2us, replace
* Keep arrival years strictly between 1988 and 1995
keep if qyr2us>1988&qyr2us<1995
* Decennial records (fromacs==0): birth year is the first four characters of qdb
gen dby_cen=substr(qdb,1,4) if fromacs==0
replace dby=dby_cen if fromacs==0
destring dby, replace
* Keep birth years strictly between 1961 and 1976
keep if dby>1961 & dby<1976
gen age_at_entry=qyr2us-dby
* Descriptives for the two arrival-age groups used later: under 18, and 18 to 21
summ age_at_entry if age_at_entry<18
summ age_at_entry if age_at_entry>17 & age_at_entry<22
destring qpobst, replace
keep cmid pnum age sex qrace1 qpobst qcitizen qyr2us qwklyrwk qwklyrhr qincwg qincse qinctot rel qspan qms schl qmig qmigpl qmigst qmigco esr qwklwk qlayoff qabsent qrecall qlookwk qbackwk qlastwk ind occ qcow msp chtot state pseq pik sedf year_obs dby age_at_entry qspeak qengabil
sort pik
* NOTE(review): this person-level file is saved without removing duplicate or
* blank PIKs, yet it is later the using file of an m:1 merge on pik - confirm
* that pik is unique here.
save /.../immig/men_acsdecen_viet.dta, replace
* Reduce to the list of distinct, non-blank PIKs used to filter the EHF records
keep pik
duplicates drop
drop if pik==""
saveold /.../immig/men_acsdecen_viet_pik.dta, replace
* Same PIK list exported in SAS XPORT V5 format
export sasxport5 "/.../pikmen", rename replace
clear

* Women
* Builds the women's person-level extract and the matching list of unique PIKs,
* applying the same filters as the men's section.
cd /.../destcenacs/2021/
use cenacs_women_pik.dta
* Keep records whose qpobst code is "247" (presumably Vietnam, per the file header - confirm)
keep if qpobst=="247"
destring age qyr2us, replace
* Keep arrival years strictly between 1988 and 1995
keep if qyr2us>1988&qyr2us<1995
* Decennial records (fromacs==0): birth year is the first four characters of qdb
gen dby_cen=substr(qdb,1,4) if fromacs==0
replace dby=dby_cen if fromacs==0
destring dby, replace
* Moves the key variables to the front; the men's section has no matching step
order cmid pnum fromacs age sex qdb dby dby_cen qrace1 qpobst qcitizen qyr2us
* Keep birth years strictly between 1961 and 1976
keep if dby>1961 & dby<1976
gen age_at_entry=qyr2us-dby
* Descriptives for the two arrival-age groups used later: under 18, and 18 to 21
summ age_at_entry if age_at_entry<18
summ age_at_entry if age_at_entry>17 & age_at_entry<22
destring qpobst, replace
keep cmid pnum age sex qrace1 qpobst qcitizen qyr2us qwklyrwk qwklyrhr qincwg qincse qinctot rel qspan qms schl qmig qmigpl qmigst qmigco esr qwklwk qlayoff qabsent qrecall qlookwk qbackwk qlastwk ind occ qcow msp chtot state pseq pik sedf year_obs dby age_at_entry qspeak qengabil
sort pik
* NOTE(review): this person-level file is saved without removing duplicate or
* blank PIKs, yet it is later the using file of an m:1 merge on pik - confirm
* that pik is unique here.
save /.../immig/women_acsdecen_viet.dta, replace
* Reduce to the list of distinct, non-blank PIKs used to filter the EHF records
keep pik
duplicates drop
drop if pik==""
saveold /.../immig/women_acsdecen_viet_pik.dta, replace
* Same PIK list exported in SAS XPORT V5 format
export sasxport5 "/.../pikwom", rename replace
clear

* Combined
* Stack the women's and men's person-level extracts and label each record's
* survey source.
* Fix: the use command takes the option clear, not replace; with replace the
* command is rejected and the script stops here.
use "/.../immig/women_acsdecen_viet.dta", clear
append using "/.../immig/men_acsdecen_viet.dta"
sort pik
destring year_obs, replace
* year_obs of 2000 is labelled Census, any later year ACS; a missing or earlier
* year_obs leaves sourcedata blank
gen sourcedata="Census" if year_obs==2000
replace sourcedata="ACS" if year_obs>2000 & year_obs!=.
save "/.../immig/both_acsdecen_variables.dta", replace

*** Generate full earnings histories
* LEHD EHF Data: MEN
* Builds one record per person-year describing the main (highest annual
* earnings) job, plus person-year totals over all jobs, restricted to PIKs in
* the men's PIK list.
global ehf "/.../lehd/ehf_dta"
foreach st in ar az ca co dc de ia il in ks md me mt nd ne nm nv ok pa tn tx va wa {
    !gunzip "$ehf/ehf_`st'.dta.gz"
}

* Start the stack with Arkansas: drop years before 1993 and year 2015, and keep
* only records whose PIK is in the men's PIK list
use "$ehf/ehf_ar.dta", clear
gen state="ar"
keep pik state year earn_ann earn1 earn2 earn3 earn4 sein
drop if year< 1993
drop if year == 2015
sort pik
merge m:1 pik using  /.../immig/men_acsdecen_viet_pik.dta
keep if _merge==3
drop _merge

* Append each remaining state, tag the newly appended rows with the state code,
* and re-apply the same filters. The running stack is saved after every state.
foreach st in az co dc de ia il in ks md me mt nd ne nm nv ok pa tn tx va wa ca {
    append using "$ehf/ehf_`st'.dta"
    replace state="`st'" if state==""
    keep pik state year earn_ann earn1 earn2 earn3 earn4 sein
    drop if year< 1993
    drop if year == 2015
    sort pik
    merge m:1 pik using  /.../immig/men_acsdecen_viet_pik.dta
    keep if _merge==3
    drop _merge
    save /.../immig/lehd/cenacs_men_ehf.dta, replace
}

* Person-year aggregates over ALL jobs, computed before restricting to the main job
bysort pik year: egen double maxearn = max(earn_ann)
bysort pik year: egen totearn_all = sum(earn_ann)
* njobs_all: flag the first row of each pik-year-sein cell and sum the flags
bysort pik year sein: gen job=_n
replace job=0 if job>1
bysort pik year: egen njobs_all = sum(job)
drop job
* nqtr_all: number of quarters with positive earnings summed over all jobs
forval i=1/4 {
    bysort pik year: egen totq`i'=sum(earn`i')
    gen work`i' = totq`i'>0 & totq`i'!=.
}
egen nqtr_all=rsum(work1 work2 work3 work4)
drop work1 work2 work3 work4

* Main job each year: keep the job(s) whose annual earnings equal the maximum
drop if earn_ann < maxearn
* Fix: a missing quarterly value compares as greater than zero in Stata, so the
* original test counted missing quarters as worked. Require a non-missing value,
* matching the guard used for the work flags above.
forval i=1/4 {
    gen pearn`i' = (earn`i'>0 & !missing(earn`i'))
}

***  BLS All Urban Consumers CPI, make quarterly deflator (df1-df4);
sort year
merge m:1 year using  /.../immig/cpi.dta
keep if _merge==3
drop _merge

forval i=1/4 {
    replace df`i' = 1/df`i'
    replace earn`i' = earn`i'*df`i'
}

* NOTE(review): maxearn and totearn_all were computed from earn_ann before this
* deflation, and the deflated earn_ann rebuilt below is not in the final keep
* list, so qrtearn and totearn_all are saved in nominal terms - confirm intended.
drop earn_ann
gen earn_ann = earn1 + earn2 + earn3 + earn4
gen totq = pearn1+pearn2+pearn3+pearn4
label variable totq "Total Quarters Worked in Main Job"
gen qrtearn = maxearn/totq
label variable qrtearn "Mean Quarterly Earnings in Main Job"
* When several jobs tie for the maximum, keep the one with the lowest sein
bysort pik year (sein): keep if _n == 1
keep pik year sein totq qrtearn totearn_all njobs_all nqtr_all state
rename state st_lehd

sort pik year
save /.../immig/lehd/cenacs_men_ehf.dta, replace

* Attach the men's ACS/Decennial characteristics to the main-job panel, print
* descriptives for the two arrival-age groups, and save the linked file.
sort pik year
merge m:1 pik using /.../immig/men_acsdecen_viet.dta, keep(match) nogenerate

* Replace the survey-reported age with age in the earnings year
drop age
generate age = year - dby

* Conditions reused by the descriptive statistics below
local kid   age_at_entry<18
local adult age_at_entry>17 & age_at_entry<22
local prime age>29&age<36

summarize totq qrtearn if `prime' & `kid'
summarize totq qrtearn if `prime' & `adult'

* n is 1 on one row per person and 0 elsewhere, so the count of ones is the
* number of distinct persons
bysort pik: generate n = (_n == 1)
tabulate n if `kid'
tabulate n if `adult'
drop n

summarize age if `kid'
summarize age if `adult'
summarize qrtearn if `prime' & `kid'
summarize qrtearn if `prime' & `adult'
save /.../immig/men_acsdecen_viet_ehf.dta, replace
clear

*  LEHD EHF Data: WOMEN
* Builds one record per person-year describing the main (highest annual
* earnings) job, plus person-year totals over all jobs, restricted to PIKs in
* the women's PIK list. Mirrors the men's section.
global ehf "/.../lehd/ehf_dta"
foreach st in ar az ca co dc de ia il in ks md me mt nd ne nm nv ok pa tn tx va wa {
    !gunzip "$ehf/ehf_`st'.dta.gz"
}

* Start the stack with Arkansas: drop years before 1993 and year 2015, and keep
* only records whose PIK is in the women's PIK list
use "$ehf/ehf_ar.dta", clear
gen state="ar"
keep pik state year earn_ann earn1 earn2 earn3 earn4 sein
drop if year< 1993
drop if year == 2015
sort pik
* NOTE(review): the directory prefix on the next line differs from the one used
* for the same file inside the loop below - confirm both point to the same file.
merge m:1 pik using  /p.../immig/women_acsdecen_viet_pik.dta
keep if _merge==3
drop _merge

* Append each remaining state, tag the newly appended rows with the state code,
* and re-apply the same filters. The running stack is saved after every state.
foreach st in az co dc de ia il in ks md me mt nd ne nm nv ok pa tn tx va wa ca {
    append using "$ehf/ehf_`st'.dta"
    replace state="`st'" if state==""
    keep pik state year earn_ann earn1 earn2 earn3 earn4 sein
    drop if year< 1993
    drop if year == 2015
    sort pik
    merge m:1 pik using  /.../immig/women_acsdecen_viet_pik.dta
    keep if _merge==3
    drop _merge
    save /.../immig/lehd/cenacs_women_ehf.dta, replace
}

* Person-year aggregates over ALL jobs, computed before restricting to the main job
bysort pik year: egen double maxearn = max(earn_ann)
bysort pik year: egen totearn_all = sum(earn_ann)
* njobs_all: flag the first row of each pik-year-sein cell and sum the flags
bysort pik year sein: gen job=_n
replace job=0 if job>1
bysort pik year: egen njobs_all = sum(job)
drop job
* nqtr_all: number of quarters with positive earnings summed over all jobs
forval i=1/4 {
    bysort pik year: egen totq`i'=sum(earn`i')
    gen work`i' = totq`i'>0 & totq`i'!=.
}
egen nqtr_all=rsum(work1 work2 work3 work4)
drop work1 work2 work3 work4

* Main job each year: keep the job(s) whose annual earnings equal the maximum
drop if earn_ann < maxearn
* Fix: a missing quarterly value compares as greater than zero in Stata, so the
* original test counted missing quarters as worked. Require a non-missing value,
* matching the guard used for the work flags above.
forval i=1/4 {
    gen pearn`i' = (earn`i'>0 & !missing(earn`i'))
}
gen totq = pearn1+pearn2+pearn3+pearn4
label variable totq "Total Quarters Worked in Main Job"

***  BLS All Urban Consumers CPI, make quarterly deflator (df1-df4);
sort year
merge m:1 year using  /.../immig/cpi.dta
keep if _merge==3
drop _merge

forval i=1/4 {
    replace df`i' = 1/df`i'
    replace earn`i' = earn`i'*df`i'
}
* NOTE(review): maxearn and totearn_all were computed from earn_ann before this
* deflation, and the deflated earn_ann rebuilt below is not in the final keep
* list, so qrtearn and totearn_all are saved in nominal terms - confirm intended.
drop earn_ann
gen earn_ann = earn1 + earn2 + earn3 + earn4
gen qrtearn = maxearn/totq
label variable qrtearn "Mean Quarterly Earnings in Main Job"
* When several jobs tie for the maximum, keep the one with the lowest sein
bysort pik year (sein): keep if _n == 1
keep pik year sein totq qrtearn totearn_all njobs_all nqtr_all state
rename state st_lehd

sort pik year
save /.../immig/lehd/cenacs_women_ehf.dta, replace

* Attach the women's ACS/Decennial characteristics to the main-job panel, print
* descriptives for the two arrival-age groups, and save the linked file.
merge m:1 pik using /.../immig/women_acsdecen_viet.dta, keep(match) nogenerate

* Replace the survey-reported age with age in the earnings year
drop age
generate age = year - dby

* Conditions reused by the descriptive statistics below
local kid   age_at_entry<18
local adult age_at_entry>17 & age_at_entry<22
local prime age>29&age<36

summarize totq qrtearn if `prime' & `kid'
summarize totq qrtearn if `prime' & `adult'

* n is 1 on one row per person and 0 elsewhere, so the count of ones is the
* number of distinct persons
bysort pik: generate n = (_n == 1)
tabulate n if `kid'
tabulate n if `adult'
drop n

summarize age if `kid'
summarize age if `adult'
summarize qrtearn if `prime' & `kid'
summarize qrtearn if `prime' & `adult'
save /.../immig/women_acsdecen_viet_ehf.dta, replace
clear

*** Establishment / Firm Characteristics
* ECF Variables by sein - year
* Establishment size, NAICS code, average earnings (and whether > state median)
* For each state: read the quarterly ECF SEIN file, collapse it to one record
* per employer-year, and save a temporary state file. The state files are then
* stacked into one file and erased.
foreach st in ar az co dc de il in ia ks me md mt ne nv nm nd ok pa tn tx va wa ca {
    cd /d.../lehd_s2014_ecf/`st'/
    import sas sein year quarter sein_best_emp1 sein_best_emp2 sein_best_emp3 sein_best_wages num_estabs mode_es_county_emp mode_leg_county_emp mode_es_naics_fnl2012_emp using ecf_`st'_sein
    drop if year < 1991
    drop if year==2015

    * Average quarterly employment: mean of the three employment counts
    gen qrtemp = sein_best_emp1 + sein_best_emp2 + sein_best_emp3
    replace qrtemp = qrtemp/3

    ***  BLS All Urban Consumers CPI deflator
    * NOTE(review): this section uses a variable named df, while the EHF sections
    * use df1-df4 from the same CPI file. If the file holds only df1-df4, df is
    * an ambiguous abbreviation and the replace fails - confirm df exists.
    sort year
    merge m:1 year using  /.../immig/cpi.dta
    keep if _merge==3
    drop _merge
    replace df = 1/df
    replace sein_best_wages = sein_best_wages*df
    drop df

    * Deflated wages per employee; undefined when employment is zero
    gen meansein = sein_best_wages/qrtemp
    replace meansein = . if qrtemp == 0
    * Fall back to the second county variable when the first is blank
    replace mode_es_county_emp=mode_leg_county_emp if mode_es_county_emp==""
    drop mode_leg_county_emp
    rename mode_es_county_emp county
    rename mode_es_naics_fnl2012_emp naics_fnl2012

    * Use the first quarter's county and NAICS code for the whole employer-year:
    * blank later quarters, convert to numeric, then fill within employer-year
    * (sorting on the variable places missing values last)
    bysort sein year (quarter): gen many = _n
    foreach var in county naics_fnl2012 {
        replace `var' = "" if many > 1
        destring `var', replace
        bysort sein year (`var'): replace `var' = `var'[_n-1] if mi(`var')
    }

    collapse (mean) meansein qrtemp (firstnm) county naics_fnl2012, by(sein year)

    * Restore the three-character county code
    tostring county, replace
    * Fix: a missing county converts to "." and was then padded to "00.";
    * keep it as an empty string instead
    replace county = "" if county == "."
    replace county = "0" + county if length(county) == 2
    replace county = "00" + county if length(county) == 1

    * sein_high: employer's mean earnings exceed the within-state median for
    * that year (each pass of this loop holds a single state)
    bysort year: egen st_med=median(meansein)
    sort sein year
    gen sein_high=meansein>st_med
    replace sein_high=. if meansein==.
    drop st_med
    rename qrtemp sein_emp

    save /.../immig/lehd/ecf_sein_`st'.dta, replace
    clear
}

* Stack the state files
use /.../immig/lehd/ecf_sein_ar.dta
foreach st in az ca co dc de il in ia ks me md mt ne nv nm nd ok pa tn tx va wa {
    append using /.../immig/lehd/ecf_sein_`st'.dta
}
sort sein year
save /.../immig/lehd/ecf_sein_all.dta, replace
clear
* Remove the temporary state files
foreach st in ar az ca co dc de il in ia ks me md mt ne nv nm nd ok pa tn tx va wa {
    erase /.../immig/lehd/ecf_sein_`st'.dta
}

*************************************************
*************************************************
*** 01b_sein_year_vars_to_merge               ***
*************************************************
*************************************************

* SEIN - Year level variables for Vietnamese analysis
* Pull full list of main job SEINs and only save for those
*** Years 1993-2014

* Distinct employer-year pairs appearing as a main job in either linked file;
* only the two key variables are read from disk
cd /.../immig/
use sein year using women_acsdecen_viet_ehf.dta
append using men_acsdecen_viet_ehf.dta, keep(sein year)
keep if !(year<1993 | year==2015)
duplicates drop
sort sein year
save viet_main_job_sein_years.dta, replace
clear

* Start from EHF
* Generate average quarterly earnings over quarters worked
* Deflate earnings to 2015 values, drop 2015 as data is partial
* For each state, this loop builds one record per employer-year (restricted to
* employer-years that are a main job in the linked sample) describing the
* employer's whole workforce: headcount, share with pob code "D", identity and
* pob of the top earner, and earnings percentiles.

foreach st in ar az co dc de il in ia ks me md mt ne nv nm nd ok pa tn tx va wa ca {
cd /.../lehd/ehf_dta/
* NOTE(review): the earlier EHF sections already unzip these files and do not
* re-compress them, so this call may find no archive to expand - confirm.
!gunzip ehf_`st'.dta.gz
use ehf_`st'.dta
keep pik year earn* sein
sort sein year 
* Keep all workers at the employer-years of interest
merge m:1 sein year using /.../immig/viet_main_job_sein_years.dta
keep if _merge==3
drop _merge
drop if year<1993
drop if year==2015

* Zero quarterly earnings become missing so the row mean below averages over
* quarters with earnings only
recode earn1 0=.
recode earn2 0=.
recode earn3 0=.
recode earn4 0=.
egen earn=rmean(earn1 earn2 earn3 earn4)
* Quarter-worked flags and their count
gen q1=earn1>0&earn1!=.
gen q2=earn2>0&earn2!=.
gen q3=earn3>0&earn3!=.
gen q4=earn4>0&earn4!=.
egen qtrs=rsum(q1 q2 q3 q4)
drop earn1 earn2 earn3 earn4 q1 q2 q3 q4

***  REDACTED: obtain BLS All Urban Consumers CPI, make annual deflator (deflate);
* NOTE(review): the variable deflate used below is created by the redacted step
* above; as shown, this section cannot run without it.

gen dearn=deflate*earn
gen dearn_ann=deflate*earn_ann
drop deflate earn earn_ann
drop if year==2015
sort pik sein year

* Create salary ranking during the year
* rank 1 is the highest deflated mean quarterly earnings within the employer-year
gsort sein year -dearn
by sein year: gen rank=_n
sort pik
* Merge in ICF, keep all variables
* pob and dob are not among the EHF variables kept above, so they come from the
* ICF file. NOTE(review): rank was assigned before this merge and unmatched rows
* are dropped afterwards, so rank 1 may be absent and tot_emps (the largest
* surviving rank) is not a count of surviving rows - confirm intended.
sort pik
merge m:1 pik using /.../icf/icf23.dta
tab _merge
keep if _merge==3
drop _merge
gen dyr=year(dob)
gen age=year-dyr
drop dob dyr
gen state="`st'"

gsort sein year -dearn

* Ethnicity of top earner
* Take pob from the rank-1 row and carry it down the employer-year
sort sein year rank
gen top_eth=pob if rank==1
by sein year: replace top_eth=top_eth[_n-1] if top_eth==""

* PIK of top earner
sort sein year rank
gen top_pik=pik if rank==1
by sein year: replace top_pik=top_pik[_n-1] if top_pik==""

* # employees
bysort sein year: egen tot_emps=max(rank)

* % Vietnamese
* pob code "D" is presumably Vietnam - confirm against the ICF codebook
gen viet=pob=="D"
bysort sein year: egen tot_viet=sum(viet)
gen pct_viet=tot_viet/tot_emps

* % Average wage percentile of (other) Vietnamese
* rank divided by tot_emps, so smaller values mean higher-paid workers
gen vpct=rank/tot_emps if viet==1
bysort sein year: egen viet_pctile=mean(vpct)
drop vpct

* Employer-year percentiles of deflated mean quarterly earnings
foreach p in 10 20 30 40 50 60 70 80 90 {
	by sein year: egen p`p'=pctile(dearn), p(`p')
}

* Reduce to one record per employer-year
keep sein year pct_viet top_eth top_pik tot_emps viet_pctile p10 p20 p30 p40 p50 p60 p70 p80 p90
bysort sein year: gen n=_n
keep if n==1
drop n
save /.../immig/lehd/viet_main_job_sein_vars_`st'.dta, replace
clear
* Re-compress the state EHF file
!gzip /.../lehd/ehf_dta/ehf_`st'.dta
}

* Stack the per-state employer-year files (Arkansas first) into a single file
local other_states az ca co dc de il in ia ks me md mt ne nv nm nd ok pa tn tx va wa
use /.../immig/lehd/viet_main_job_sein_vars_ar.dta
foreach s of local other_states {
    append using /.../immig/lehd/viet_main_job_sein_vars_`s'.dta
}
sort sein year
save /.../immig/lehd/viet_main_job_sein_vars_all.dta, replace

*************************************************
*************************************************
*** 02_final_data_and_analysis                ***
*************************************************
*************************************************

*** FILE: Combine all data and run analysis
clear

* EHF job histories
* Stack the men's and women's main-job panels; the tabulation of n shows
* whether any person has more than one row in a year
cd /.../immig
use /.../immig/lehd/cenacs_men_ehf.dta
append using /.../immig/lehd/cenacs_women_ehf.dta
bysort pik year: gen n=_n
tab n
drop n
summ year
sort pik
* ACS / Decennial variables
* NOTE(review): an m:1 merge requires pik to be unique in the using file, which
* is built without removing duplicate PIKs - confirm. Using-only persons are not
* dropped here; they are removed by the keep after the next merge.
merge m:1 pik using "/.../immig/both_acsdecen_variables.dta"
drop _merge
* ECF-based main job variables
sort sein year
merge m:1 sein year using /.../immig/lehd/ecf_sein_all.dta
drop if _merge==2
tab _merge
keep if _merge==3
drop _merge
* EHF-based main job variables
sort sein year
merge m:1 sein year using /.../immig/lehd/viet_main_job_sein_vars_all.dta
keep if _merge==3
drop _merge
rename sein main_sein

* First LEHD State and county
* Fix: order rows by year inside the by-prefix. The original sorted by pik and
* year and then ran bysort on pik alone, which relies on the second sort leaving
* the year order intact.
preserve
bysort pik (year): gen n=_n
keep if n==1
gen frst_st=st_lehd
gen frst_cty=county
keep pik frst_st frst_cty
sort pik
save frst_lehd_st.dta, replace
restore

sort pik
merge m:1 pik using frst_lehd_st.dta
keep if _merge==3
drop _merge

* Age in the earnings year.
* Fix: the ACS/Decennial file merged in above already carries a variable named
* age, so generating age without dropping it first stops the script with an
* "already defined" error. The men's and women's sections drop it the same way.
capture drop age
gen age=year-dby
* Sample: arrived at ages 14 to 21 and at least 18 in the earnings year
drop if age_at_entry>21
drop if age_at_entry<14
drop if age<18
gen young=age_at_entry>13 & age_at_entry<18
gen female=sex=="2"
destring schl, replace
gen coll=schl>12&schl!=.
gen hs=schl>8 & schl<13

* English ability (combine only English & very well, as Decennial does not separate)
* The Census and ACS records use different qengabil codings, so each level is
* mapped separately by source
destring qengabil, replace
gen vwelleng=0
replace vwelleng=1 if sourcedata=="Census"&qengabil==1
replace vwelleng=1 if sourcedata=="ACS"&inlist(qengabil,1,2,6,10,14)
gen welleng=0
replace welleng=1 if sourcedata=="Census"&qengabil==2
replace welleng=1 if sourcedata=="ACS"&inlist(qengabil,3,7,11,15)
gen notweng=0
replace notweng=1 if sourcedata=="Census"&qengabil==3
replace notweng=1 if sourcedata=="ACS"&inlist(qengabil,4,8,12,16)
gen noeng=0
replace noeng=1 if sourcedata=="Census"&qengabil==4
replace noeng=1 if sourcedata=="ACS"&inlist(qengabil,5,9,13,17)
* State-by-year cell identifier, absorbed as a fixed effect in the regressions
egen styr=group(st_lehd year)

* Young-arrival indicator interacted with period
gen young9394=young==1&(year==1993|year==1994)
gen young9599=young==1&year>1994&year<2000
gen young0004=young==1&year>1999&year<2005
gen young0509=young==1&year>2004&year<2010
gen young1014=young==1&year>2009&year<2015

* Year dummies
forval yr=1993/2014 {
    gen yr`yr'=year==`yr'
}

* Control-by-year interactions. Each family is created as one contiguous run of
* variables (all years of age, then all years of female, and so on) because the
* regressions refer to them with first-last variable ranges, which depend on the
* order of the variables in the dataset. Do not interleave the families.
foreach ctl in age female hs coll vwelleng {
    forval yr=1993/2014 {
        gen `ctl'`yr'=`ctl'*yr`yr'
    }
}

gen early_states=(st_lehd=="md"|st_lehd=="co"|st_lehd=="il"|st_lehd=="in"|st_lehd=="ks"|st_lehd=="wa"|st_lehd=="ca"|st_lehd=="pa"|st_lehd=="az"|st_lehd=="mt"|st_lehd=="nm"|st_lehd=="tx")

* Gap between first LEHD year and year of US arrival
bysort pik: egen first_lehd=min(year)
gen gap_us=first_lehd-qyr2us

* Diagnostics only: everything between preserve and restore leaves the data
* unchanged. Groups are reported in the order college then high school, and
* within each, young arrivals first.
preserve
bysort pik: gen n=_n
* Arrival-year distribution, one row per person
foreach ed in coll hs {
    foreach yg in 1 0 {
        tab qyr2us if `ed'==1 & young==`yg' & n==1
    }
}
* Number of persons at each value of the gap
keep young coll hs gap_us pik
duplicates drop
gen count=1
collapse (sum) count, by(young coll hs gap_us)
sort young coll hs gap_us
foreach ed in coll hs {
    foreach yg in 1 0 {
        list gap_us count if `ed'==1 & young==`yg'
    }
}
restore

*** Cluster sites by State & county
*** Information from publicly available documentation of this American Homecoming Act related episode;
*** SOURCE: Chung Hoang Chuong & Le Van (1994) "The Amerasians from Vietnam: A California Study" (p. 83-87);

* cluster is 1 when the person's first LEHD state and county is one of the
* listed state-county pairs, and 0 otherwise. Counties are kept as
* three-character strings, one list per state.
gen cluster = 0

local cty_ar 035
local cty_az 013 019 021
local cty_ca 017 037 041 059 061 067 073 075 081 085
local cty_co 001 005 014 031 035 059
local cty_dc 001
local cty_ia 049 113 153 163 181 193
local cty_il 027 031 037 043 063 073 083 089 093 097 111 119 129 133 161 163 167 197
local cty_in 019 029 043 115 061 143
local cty_ks 091 103 121 209
local cty_md 009 017 021 031 033
local cty_me 005 023 031
local cty_ne 043 109
local cty_nd 017
local cty_ok 017 027 083 087 109 125
local cty_pa 003 007 017 019 029 045 049 051 091 101 125 129
local cty_tn 021 037 043 047 147 149 157 165 167 187 189
local cty_tx 071 085 113 121 139 157 199 201 213 221 231 245 251 257 291 339 361 367 397 439 473
local cty_va 013 023 036 041 043 047 053 059 061 075 085 087 099 107 127 145 149 153 161 177 179 187 510 570 600 610 630 670 683 685 730 760 770 775
local cty_wa 011 029 033 053 061

foreach s in ar az ca co dc ia il in ks md me ne nd ok pa tn tx va wa {
    foreach c of local cty_`s' {
        replace cluster=1 if frst_st=="`s'" & frst_cty=="`c'"
    }
}

* Montana is dropped from the analysis sample (it is still listed in early_states)
drop if st_lehd=="mt"
gen lnearn=ln(qrtearn)

* Winsorize totearn_all for the top 1% of observations
* Drop earnings below $250 (in the analyses below)
summ totearn_all if age_at_entry<22 & year>1992 & early_states==1 & qrtearn>=250, detail
* NOTE(review): winsor is not an official Stata command (presumably the
* user-written SSC package) - confirm it is installed. Only the upper cap is
* applied: the replace below changes values that exceed the winsorized value
* and leaves low values untouched.
winsor totearn_all, p(0.01) gen(totearn_topcoded)
replace totearn_all=totearn_topcoded if totearn_all>totearn_topcoded & totearn_all!=.
* Zero or negative totals give a missing log
gen lnearn_all=ln(totearn_all)

*** App Table 9:
* All models absorb state-by-year cells and cluster standard errors by state.
* The regressor groups and sample conditions shared by both panels are held in
* locals, so this section must be run as one piece (locals do not survive when
* lines are executed one at a time).
local young  young9394 young9599 young0004 young0509 young1014
local base   `young' female1993-female2014 age1993-age2014
local educ   coll1993-coll2014 hs1993-hs2014
local flu    vwelleng1993-vwelleng2014
local samp   age_at_entry<22 & year>1992 & early_states==1 & qrtearn>=250
local clsamp age_at_entry<22 & year>1992 & early_states==1 & gap_us<4 & qrtearn>=250 & cluster==1
local opts   absorb(styr) vce(cluster st_lehd)

* A: Log annual earnings across all jobs
* Col 1: base controls
areg lnearn_all `base' if `samp', `opts'

* Col 2: Add college * year, HS * year, and fluency * year
areg lnearn_all `base' `educ' `flu' if `samp', `opts'
   *** Check if education or fluency matter more
areg lnearn_all `base' `educ' if `samp', `opts'
areg lnearn_all `base' `flu' if `samp', `opts'

* Col 4: Restrict to cluster sites (first LEHD year within 3 years of arrival): all controls
areg lnearn_all `base' `educ' `flu' if `clsamp', `opts'
gen sample2=e(sample)

*** Col 3: Repeat col 4 without the extra controls (Base Specification)
areg lnearn_all `base' if `clsamp', `opts'

* B: Raw annual earnings across all jobs
* Col 1: base controls
areg totearn_all `base' if `samp', `opts'

* Col 2: Add college * year, HS * year, and fluency * year
areg totearn_all `base' `educ' `flu' if `samp', `opts'
   *** Check if education or fluency matter more
areg totearn_all `base' `educ' if `samp', `opts'
areg totearn_all `base' `flu' if `samp', `opts'

* Col 4: Restrict to cluster sites (first LEHD year within 3 years of arrival): all controls
areg totearn_all `base' `educ' `flu' if `clsamp', `opts'

*** Col 3: Repeat col 4 without the extra controls (Base Specification)
* Fix: the original also included the college-by-year interactions here, which
* contradicts the base-specification label and Panel A column 3.
areg totearn_all `base' if `clsamp', `opts'

**********************
* Prepare data
* Remove the regression-only variables (each family listed once; the original
* list repeated the age interactions and yr1993)
drop yr1993-yr2014 young9394 young9599 young0004 young0509 young1014 female1993-female2014 age1993-age2014 coll1993-coll2014 hs1993-hs2014 vwelleng1993-vwelleng2014
drop sex dby qrace1 qpobst qcitizen qwklyrwk qwklyrhr qincwg qincse qinctot rel qspan qms schl qmig qmigpl qmigst esr qwklwk qlayoff qabsent qrecall qlookwk qbackwk qlastwk ind occ qcow msp chtot state pseq sedf year_obs qspeak qengabil
drop welleng notweng noeng styr cmid pnum sourcedata

* Main-job characteristics
* nail: NAICS code 812113; is_top: the worker is the employer's top earner;
* eth_top: the top earner's pob code is "D"; small: fewer than 51 employees
gen nail = naics_fnl2012==812113
gen is_top = pik==top_pik
gen eth_top=top_eth=="D"
sort pik year main_sein
gen small=sein_emp<51

* Type of the first observed job, carried to every year of the person:
*   4 = top earner "D", small employer     3 = top earner "D", larger employer
*   2 = other top earner, small employer   1 = other top earner, larger employer
sort pik year
by pik: gen first_job=_n
replace first_job=0 if first_job>1
gen fjob_grp=4 if first_job==1 & eth_top==1 & small==1
replace fjob_grp=3 if first_job==1 & eth_top==1 & small==0
replace fjob_grp=2 if first_job==1 & eth_top==0 & small==1
replace fjob_grp=1 if first_job==1 & eth_top==0 & small==0
* Fix: carry forward with by, not bysort. The fill only works when the
* first-job row is first within the person; bysort on pik alone may re-sort and
* lose the year order set above, whereas by uses the existing order.
by pik: replace fjob_grp=fjob_grp[_n-1] if fjob_grp==.
drop small first_job

* Numeric 6-, 5- and 4-digit industry codes: keep a numeric copy of the full
* code, then take leading characters of its string form
gen naics6=naics_fnl2012
tostring naics_fnl2012, replace
gen naics5 = substr(naics_fnl2012,1,5)
gen naics4 = substr(naics_fnl2012,1,4)
destring naics4 naics5, replace

* High-tech NAICS codes per the NSF definition (publicly available);
* SOURCE: Hecker (2005): "High-technology employment: a NAICS based update", BLS Monthly Labor Review.
* nsf is 1 when the industry matches any listed code at the 4-, 5- or 6-digit level
gen nsf = 0
foreach code in 3251 3252 3253 3254 3255 3256 3259 3331 3332 3333 3336 3339 3341 3342 3343 3344 3345 3346 3353 3361 3362 3363 3364 3391 5112 5142 5413 5415 5416 5417 6117 {
    replace nsf = 1 if naics4 == `code'
}
foreach code in 32411 33599 {
    replace nsf = 1 if naics5 == `code'
}
foreach code in 514191 811212 332992 332993 332994 332995 {
    replace nsf = 1 if naics6 == `code'
}

* Restrict the panel to 2000 - 2014 (records with missing year are removed as well)
drop if year<=1999 | year>=2015

* Earnings decile of qrtearn, labelled by the upper bound of the decile (10, 20, ..., 100).
* p10 ... p90 are the cutoff variables already present in the data at this point.
* pctile stays missing when qrtearn is missing.
gen pctile = 10 if qrtearn <= p10
forvalues upper = 20(10)90 {
	local lower = `upper' - 10
	replace pctile = `upper' if qrtearn > p`lower' & qrtearn <= p`upper'
}
replace pctile = 100 if qrtearn > p90 & qrtearn != .

* Person-level averages of qrtearn and meansein over the first (2000-2002) and
* last (2012-2014) three years of the window. Each average only uses records
* inside the respective three-year span and is then attached to every record of the person.
foreach span in 2000_02 2012_14 {
	local from = substr("`span'", 1, 4)
	local to = `from' + 2
	gen temp_earn = qrtearn if inrange(year, `from', `to')
	bysort pik: egen mean`span' = mean(temp_earn)
	gen temp_sein = meansein if inrange(year, `from', `to')
	bysort pik: egen meansein`span' = mean(temp_sein)
	drop temp_earn temp_sein
}

* Establishment size bins: 0/1 indicators based on sein_emp
* (0-20, 21-1000, and 1001 or more).
* Stata orders missing values above every number, so an open-ended ">=" test
* is true for missing sein_emp. The top bin therefore excludes missing
* explicitly; records with missing sein_emp are 0 in all three bins, as they
* already are in the two bounded bins.
gen est_0_20=sein_emp<21
gen est_21_1000=sein_emp>=21 & sein_emp < 1001
gen est_1001=sein_emp>=1001 & !missing(sein_emp)

* Number of quarters worked by situation: each indicator is scaled by totq so
* that summing over a person's records later yields quarters spent in that situation.
foreach flag of varlist nail nsf is_top eth_top est_0_20 est_21_1000 est_1001 sein_high {
	replace `flag' = totq * `flag'
}

* Quarters of "potential availability" in LEHD: four quarters for every year
* from the person's first observed year through 2014, and the share of those
* quarters with employment (sum of nqtr_all over the person's records).
sort pik year
by pik: egen first_year = min(year)
gen pot_qtrs_lehd = 4 + (2014 - first_year)*4
by pik: egen all_qtrs_emp = total(nqtr_all)
gen sh_qtrs_emp = all_qtrs_emp/pot_qtrs_lehd

* st2000: st_lehd of the person's first record (in pik-year order), copied to all of the person's records
by pik: gen st2000 = st_lehd[1]

* Annual earnings change relative to the person's previous record:
*   a) declines of more than 20%, b) increases of more than 20%, c) in between (bounds included),
*   d) previous record is more than one year back (no change computed).
* ch_totearn_all is missing on a person's first record, when the gap exceeds
* one year, and when lagged earnings are zero; such records fall in none of a)-c).
sort pik year
by pik: gen l_totearn_all=totearn_all[_n-1]
by pik: gen gap=year-year[_n-1]
gen ch_totearn_all=(totearn_all-l_totearn_all)/l_totearn_all
replace ch_totearn_all=. if gap>1
summ ch_totearn_all

gen ch20minus=ch_totearn_all<-0.2
gen ch20plus=ch_totearn_all>0.2 & ch_totearn_all!=.
gen chmoderate=ch_totearn_all>=-0.2 & ch_totearn_all<=0.2
* gap is missing on each person's first record, and missing compares as larger
* than any number; without the explicit bound the first record of every person
* would be counted as a gap of more than one year.
gen gap_over1=gap>1 & gap<.
* Number of classified transitions on the record (0 or 1); missing inputs count as 0
egen tot_ch=rowtotal(ch20minus ch20plus chmoderate gap_over1)

*** Sample criteria
keep if age_at_entry<22
keep if young==1|young==0
drop if qrtearn<250
* NOTE(review): mean_qrtearn is not listed in the collapse that follows, so it does not survive it
bysort pik: egen mean_qrtearn=mean(qrtearn)

* seins: 1 on exactly one record per person-employer (pik main_sein) pair, 0 otherwise,
* so that summing over a person's records counts distinct main employers.
* The tag is set after the sample restrictions: the row order within a
* pik main_sein group is arbitrary, so a tag assigned earlier could land on a
* record that is dropped by the earnings filter, losing that employer from the
* count even though other records with the same employer remain.
bysort pik main_sein: gen seins=(_n==1)

* Collapse to one record per person (and the person-level traits in by()):
* sums for quarter counts and transition counts, means for earnings measures, max for sh_qtrs_emp and cluster.
collapse ///
	(sum) totq seins nail nsf is_top eth_top est_0_20 est_21_1000 est_1001 sein_high ch20minus ch20plus chmoderate gap_over1 tot_ch ///
	(mean) qrtearn lnearn pctile pct_eth mean2000_02 meansein2000_02 mean2012_14 meansein2012_14 age ///
	(max) sh_qtrs_emp cluster, ///
	by(pik young female hs coll gap_us vwelleng st2000 fjob_grp)

* Turn the summed quarter counts into shares of all quarters worked
foreach share of varlist nail nsf is_top eth_top est_0_20 est_21_1000 est_1001 sein_high {
	replace `share' = `share'/totq
}

* Average number of quarters per employer
gen dur = totq/seins

* Relative change between the 2000-02 and 2012-14 averages (own earnings and meansein)
gen earn_ch = (mean2012_14 - mean2000_02)/mean2000_02
gen sein_ch = (meansein2012_14 - meansein2000_02)/meansein2000_02

* Distribution of classified year-to-year transitions
foreach kind in 20minus 20plus moderate {
	gen pct_`kind' = ch`kind'/tot_ch
}
gen pct_gap_over1 = gap_over1/tot_ch

* Undefined changes are set to zero
foreach chg in earn_ch sein_ch {
	replace `chg' = 0 if `chg' == .
}

*** Table 2 and Appendix Tables 10 and 11
* Outcomes are summarized separately for young==1 and young==0 and then regressed on
* young under three nested control sets, with st2000 absorbed and standard errors clustered on st2000.
local outcomes sh_qtrs_emp est_0_20 est_21_1000 est_1001 nail nsf sein_high is_top eth_top qrtearn lnearn pctile pct_eth seins dur earn_ch pct_20minus pct_20plus pct_moderate pct_gap_over1 sein_ch

forvalues grp = 1(-1)0 {
	summ `outcomes' if young==`grp' & qrtearn>=250
}

* Additional controls by specification:
*   1 = none (young, female, age only)
*   2 = Fluency & Education
*   3 = Fluency & Education + First Employer Traits & First Observed at Cluster Site
local spec1
local spec2 hs coll vwelleng
local spec3 hs coll vwelleng cluster i.fjob_grp

forvalues s = 1/3 {
	foreach y of local outcomes {
		areg `y' young female age `spec`s'' if qrtearn>=250, absorb(st2000) vce(cluster st2000)
	}
}

* Winsorize the change variables (winsor with p(0.01)), limit to earnings >= $250.
* The winsorized copies are written back into earn_ch / sein_ch wherever they differ.
foreach stem in earn sein {
	summ `stem'_ch, detail
}
foreach stem in earn sein {
	winsor `stem'_ch, p(0.01) gen(winsor_`stem'ch)
}
foreach stem in earn sein {
	replace `stem'_ch = winsor_`stem'ch if `stem'_ch != winsor_`stem'ch & `stem'_ch != .
}

forvalues grp = 1(-1)0 {
	summ earn_ch sein_ch if young==`grp' & qrtearn>=250
}

* Same three nested control sets as for the other outcomes:
*   1 = none, 2 = fluency & education, 3 = plus cluster site and first-job group
local spec1
local spec2 hs coll vwelleng
local spec3 hs coll vwelleng cluster i.fjob_grp

forvalues s = 1/3 {
	foreach y in earn_ch sein_ch {
		areg `y' young female age `spec`s'' if qrtearn>=250, absorb(st2000) vce(cluster st2000)
	}
}

*** Table 5
* Build two sets of indicator variables for the decomposition below.
* Any _I* variables left over from earlier are dropped first (errors are ignored by "cap").
* xi i.fjob_grp creates indicators for first-job group under the _I prefix; these are
* renamed to the ZZ prefix before the second xi call, which creates the _I* indicators
* for st2000 (xi by default removes previously created _I* variables when it runs).
* The statement order therefore matters: after this block ZZ* = first-job group, _I* = state.
cap n drop _I*
xi i.fjob_grp
renpfix _I ZZ
xi i.st2000

* Control for First Employer Traits & First Observed in a Cluster Site
* All educations together, but control for level of education and English fluency, first job type and cluster site
* NOTE(review): b1x2 is not defined in this file - presumably a user-written decomposition command; confirm it is installed.
* As called here: young, female, age and the state indicators (_I*) go in x1all; education, fluency,
* cluster and the first-job indicators (ZZ*) go in x2all and are split into groups g1-g4 by x2delta;
* x1only names young; cl() is given st2000.
foreach var in qrtearn lnearn {
b1x2 `var' if qrtearn>=250, x1all(young female age _I*) x2all(hs coll vwelleng cluster ZZ*) x1only(young) x2delta(g1=vwelleng : g2=hs coll : g3=cluster : g4=ZZ*) cl(st2000) nobase nofull
}
